Data from here: hhttps://osf.io/nbepu Original methods and full paper here: https://journals.plos.org/plosone/article?id=10.1371/journal.pone.0265662

Load required packages

figures <- "figures/"
library(stringr)
library(dplyr)
library(DescTools)
library(car)
require(foreign)
require(ggplot2)
require(Hmisc)
require(reshape2)
library(tidyverse)
library(broom)
library(aod)
library(glmtoolbox)
library(fmsb)
library(uwot)
library(ggthemes)
library(ggforce)
library(readxl) #leaves characters as characters
library(ROCR) # Use ROCR package to plot ROC curve & AUC
library(pROC)
library(caret)
library(xgboost)
library(smotefamily) # one method of ML data augmentation
library(rsample)
library(mltools) # Wrapper allows UMAP calculations
library(data.table)
library(heatmaply)
library(rstatix) # to calculate Kendall correlation with associated p-values.
library(IHW) # Bioconductor package to calculate BH threshold
library(dbscan)
library(corrplot)
library(showtext)
knitr::opts_chunk$set(tidy.opts = list(width.cutoff = 60), tidy = TRUE)
library(splines)
library(cluster)
library(factoextra)
library(clusterSim)
library(RColorBrewer)

Open Excel file

# Save the data so can run subsequent analysis ad lib
SAVE = TRUE

# quicker version readxl
ml <- read_excel("Canine health results.xlsx", sheet = "All")
cols <- c("Col_a")
for (col in 2:ncol(ml)) {
    if (col <= 26) {
        cols <- cbind(cols, paste0("Col_", letters[col]))
    } else if (col <= 52) {
        cols <- cbind(cols, paste0("Col_a", letters[col - 26]))
    } else {
        cols <- cbind(cols, paste0("Col_b", letters[col - 52]))
    }
}
names(ml) <- cols

Rename variables

ml <- dplyr::rename(ml, unique.resp.number = Col_a)
ml <- dplyr::rename(ml, Location = Col_c)
ml <- dplyr::rename(ml, setting = Col_e)
ml <- dplyr::rename(ml, Education = Col_g)
ml <- dplyr::rename(ml, Animal_Career = Col_h)
ml <- dplyr::rename(ml, Income = Col_i)
ml <- dplyr::rename(ml, C_Age = Col_j)
ml <- dplyr::rename(ml, C_Gender = Col_k)
ml <- dplyr::rename(ml, C_Diet = Col_l)
ml <- dplyr::rename(ml, Size = Col_o)
ml <- dplyr::rename(ml, D_Age = Col_p)
ml <- dplyr::rename(ml, D_Gender = Col_r)
ml <- dplyr::rename(ml, Therapeutic_Food = Col_s)
ml <- dplyr::rename(ml, Visits = Col_t)
ml <- dplyr::rename(ml, Vet_Health = Col_u)
ml <- dplyr::rename(ml, Meds = Col_ag)
ml <- dplyr::rename(ml, Health = Col_ah)
ml <- dplyr::rename(ml, D_Diet = Col_ai)
ml <- dplyr::rename(ml, Decision_Maker = Col_at)
head(ml)

Start by creating separate Sex and neuter status variables

ml$D_Sex <- rep("Male", nrow(ml))
for (i in 1:nrow(ml)) {
    if (grepl("Male", ml$D_Gender[i], fixed = TRUE)) {
        ml$D_Sex[i] <- "Male"
    } else {
        ml$D_Sex[i] <- "Female"
    }
}

# Checks and Balances
dplyr::count(ml, D_Sex)
ml$D_Neuter <- rep("BLANK", nrow(ml))
for (i in 1:nrow(ml)) {
    if (grepl("intact", ml$D_Gender[i], fixed = TRUE)) {
        ml$D_Neuter[i] <- "Intact"
    } else {
        ml$D_Neuter[i] <- "Neutered"
    }
}
dplyr::count(ml, D_Neuter)

check D_Age

dplyr::count(ml, D_Age)

Dog Age remove <1 years and unsure, then change to numeric

ml <- subset(ml, D_Age != "<1" & D_Age != "Unsure")
ml$D_Age <- as.numeric(ml$D_Age)
dplyr::count(ml, D_Age)

Initial Visualisation Excluding Health indicators with UMAP

uml <- ml

# Temporarily create the Owner Diet data labels for use
# later.

# Numeric -Euclidean
D_Age <- uml$D_Age
C_Age <- factor(uml$C_Age, ordered = TRUE, levels = c("18-19",
    "20–29", "30–39", "40–49", "50–59", "60–69", ">70"))

# Ordered FALSE since TOY is not strictly smaller than
# Small, but different.
Size <- factor(uml$Size, ordered = FALSE, levels = c("Toy (≤ 4.1 kg or ≤ 9 lb)",
    "Small (4.5–10.9 kg or 10–24 lb)", "Medium (11.4–22.3 kg or 25–49 lb)",
    "Large (22.7–40.5 kg or 50–89 lb)", "Giant (> 40.9 kg or > 90 lb)"))

uml$Education <- factor(uml$Education, ordered = TRUE, levels = c("Did not complete high school",
    "High school or equivalent", "College or University award lower than undergraduate degree",
    "University undergraduate degree", "University postgraduate degree lower than doctoral degree e.g. masters",
    "Doctoral degree"))

C_DietLabs <- uml$C_Diet
D_DietLabs <- uml$D_Diet
Edu_Labs <- uml$Education

Education <- uml$Education
# Not truely ordered because some 'prefer not to say'
Income <- as.factor(uml$Income)

# Factors even if binary - use Hamming
D_Sex <- as.factor(uml$D_Sex)
C_Gender <- as.factor(uml$C_Gender)
Neuter_Status <- as.factor(uml$D_Neuter)
Decision_Maker <- as.factor(uml$Decision_Maker)
setting <- as.factor(uml$setting)
Animal_Career <- as.factor(uml$Animal_Career)
Location <- as.factor(uml$Location)
D_Diet <- as.factor(uml$D_Diet)
C_Diet <- as.factor(uml$C_Diet)

# Reforming so we can use the explicitly set data types...
# Numeric Euclidean Ordered Manhattan Factor Hamming Factor
# Hamming Factor Hamming
Udata <- data.frame(cbind(D_Age, C_Age, Education, Size, Income,
    Neuter_Status, D_Sex, C_Gender, Animal_Career, C_Diet, D_Diet,
    setting, Decision_Maker, Location), stringsAsFactors = TRUE)  #Should be no strings as already factors
# Labels

# if using uwot
targets <- c(rep("euclidean", 1), rep("manhattan", 2), rep("hamming",
    11))

Udata$C_DietLabs <- C_DietLabs
uKeeper <- complete.cases(Udata)
Udata <- Udata[uKeeper, ]

CLabs <- Udata$C_DietLabs
Edu_Labs <- Edu_Labs[uKeeper]
# For other applications:
write.csv(Udata, "Udata.csv")
# Dont include the labels themselves in the UMAP
Udata <- dplyr::select(Udata, -C_DietLabs)

# nice-ify labels
CLabs <- stringr::str_replace(CLabs, "Vegetarian \\(consuming plants, eggs and milk\\)",
    "Vegetarian")
CLabs <- stringr::str_replace(CLabs, "Omnivore \\(i.e. eat meat\\)",
    "Omnivore")
CLabs <- stringr::str_replace(CLabs, "Omnivore reducing animal product consumption",
    "Omnivore \\(Reducing\\)")
CLabs <- stringr::str_replace(CLabs, "Vegan \\(consuming no animal products\\)",
    "Vegan")
CLabs <- stringr::str_replace(CLabs, "Pescatarian \\(including fish but no other meats\\)",
    "Pescatarian")
# Set up for doggie
Labs <- D_DietLabs[uKeeper]  #remember we complete cased above!

# nice-ify labels
Labs <- stringr::str_replace(Labs, "Meat-based – conventional",
    "Meat-based")
Labs <- stringr::str_replace(Labs, "Vegan \\(consuming no animal products\\)",
    "Vegan")
Labs <- stringr::str_replace(Labs, "Meat-based – lab-grown",
    "Lab-grown")
Labs <- stringr::str_replace(Labs, "Meat-based – raw", "Raw food")
Labs <- stringr::str_replace(Labs, "Vegetarian \\(including eggs or milk, but not meat\\)",
    "Vegetarian")

#Perform UMAP run

# Use UWOT library rather than UMAP since has built in
# categorical support hyperparameters, largely defaults

n_epochs = 50000
n_components = 2
n_n = 20
learning_rate = 0.01
scale = "maxabs"
spread = 1
min_dist = 0.001
set_op_mix_ratio = 1
local_con = 1
bandwidth = 1
repulsion_strength = 1
neg = 5
n_trees = 50
target_weight = 0.5
init = "normlaplacian"

set.seed(666)
um <- uwot::umap(Udata, target_metric = targets, spread = spread,
    n_neighbors = n_n, min_dist = min_dist, negative_sample_rate = neg,
    n_epochs = n_epochs, local_connectivity = local_con, n_trees = n_trees,
    scale = scale)
df = as.data.frame(cbind(um[, 1], um[, 2]))
names(df) <- c("x", "y")
ggplot(df, aes(x, y)) + geom_point(aes(fill = Labs), alpha = 0.8,
    shape = 21, size = 2, stroke = 0.2) + labs(fill = "Dog Diet") +
    xlab("UMAP X co-ordinates") + ylab("UMAP Y co-ordinates") +
    # geom_segment(aes(y=bottom,yend=top,x=-Inf,xend=-Inf))+
    # geom_segment(aes(y=-Inf,yend=-Inf,x=left,xend=right))+
    # scale_y_continuous(breaks=seq(bottom,top,2))+
    # scale_x_continuous(breaks=seq(left,r-ight,2))+
geom_rangeframe() + theme_tufte(base_family = "Gill Sans")

fname = paste0("umaps/", "i", i, "min_dist", min_dist, "_spread",
    spread, "_nn", n_n, "_lcon", local_con, "_neg", neg, "_epochs",
    n_epochs, "_trees", n_trees, "_", scale, ".png")
# Test Umap plot including hyperparameters ggsave(fname)

#Perform density clustering analysis ##And Dog clustering metrics

suppressWarnings(file.remove("dog_diet_clusters.csv"))
## [1] TRUE
set.seed(666)
db <- dbscan(um, eps = 0.7, minPts = 50)  #eps=1 minPts=5 db <- dbscan(um, eps=0.8, minPts = 25) dbscan(um, eps=0.86, minPts = 46)
print(db)
## DBSCAN clustering for 2609 objects.
## Parameters: eps = 0.7, minPts = 50
## Using euclidean distances and borderpoints = TRUE
## The clustering contains 6 cluster(s) and 0 noise points.
## 
##   1   2   3   4   5   6 
## 674 374 612 505 130 314 
## 
## Available fields: cluster, eps, minPts, dist, borderPoints
combi <- cbind(df, db$cluster, Labs)
names(combi) <- c("x", "y", "cluster", "Diet")
print(paste("total observations = ", nrow(combi)))
## [1] "total observations =  2609"
output <- dplyr::count(combi, Diet, name = "All Data")
print(output)
##           Diet All Data
## 1  Fungi-based        1
## 2 Insect-based        6
## 3    Lab-grown        7
## 4   Meat-based     1368
## 5      Mixture       16
## 6     Raw food      830
## 7       Unsure       10
## 8        Vegan      336
## 9   Vegetarian       35
write.table(output, "dog_diet_clusters.csv", sep = ",", row.names = FALSE,
    append = TRUE)
centeroids <- data.frame(mx = as.numeric(), my = as.numeric())
for (CLUSTER in seq(0, max(db$cluster))) {
    print(paste("cluster", CLUSTER))
    this_clust <- subset(combi, cluster == CLUSTER)
    print(paste("total observations in cluster", CLUSTER, " = ",
        nrow(this_clust)))
    output <- dplyr::count(this_clust, Diet, name = paste("Cluster",
        CLUSTER))
    print(output)
    write.table(output, "dog_diet_clusters.csv", sep = ",", row.names = FALSE,
        append = TRUE)

    if (CLUSTER > 0) {
        mx <- mean(this_clust$x)
        my <- mean(this_clust$y)
        lc <- data.frame(mx = mx, my = my)
        centeroids <- rbind(centeroids, lc)
        print(paste("Cluster", CLUSTER, "centeroid is", round(mx,
            2), round(my, 2)))
    }
}
## [1] "cluster 0"
## [1] "total observations in cluster 0  =  0"
## [1] Diet      Cluster 0
## <0 rows> (or 0-length row.names)
## [1] "cluster 1"
## [1] "total observations in cluster 1  =  674"
##         Diet Cluster 1
## 1  Lab-grown         3
## 2 Meat-based       571
## 3   Raw food       100
## [1] "Cluster 1 centeroid is -1.19 -5.04"
## [1] "cluster 2"
## [1] "total observations in cluster 2  =  374"
##         Diet Cluster 2
## 1    Mixture         9
## 2     Unsure         3
## 3      Vegan       331
## 4 Vegetarian        31
## [1] "Cluster 2 centeroid is 6.22 14.99"
## [1] "cluster 3"
## [1] "total observations in cluster 3  =  612"
##           Diet Cluster 3
## 1  Fungi-based         1
## 2 Insect-based         5
## 3    Lab-grown         2
## 4   Meat-based       424
## 5      Mixture         4
## 6     Raw food       176
## [1] "Cluster 3 centeroid is -0.14 2.71"
## [1] "cluster 4"
## [1] "total observations in cluster 4  =  505"
##           Diet Cluster 4
## 1 Insect-based         1
## 2    Lab-grown         2
## 3   Meat-based       277
## 4      Mixture         2
## 5     Raw food       221
## 6       Unsure         1
## 7   Vegetarian         1
## [1] "Cluster 4 centeroid is -4.79 -3.69"
## [1] "cluster 5"
## [1] "total observations in cluster 5  =  130"
##         Diet Cluster 5
## 1 Meat-based        96
## 2   Raw food        33
## 3     Unsure         1
## [1] "Cluster 5 centeroid is 0.52 -1.5"
## [1] "cluster 6"
## [1] "total observations in cluster 6  =  314"
##         Diet Cluster 6
## 1    Mixture         1
## 2   Raw food       300
## 3     Unsure         5
## 4      Vegan         5
## 5 Vegetarian         3
## [1] "Cluster 6 centeroid is 2.9 -5.75"

##Owner Diet clustering metrics

suppressWarnings(file.remove("owner_diet_clusters.csv"))
## [1] TRUE
combi <- cbind(df, db$cluster, CLabs)
names(combi) <- c("x", "y", "cluster", "C_Diet")
print(paste("total observations = ", nrow(combi)))
## [1] "total observations =  2609"
output <- dplyr::count(combi, C_Diet, name = "All Data")

print(output)
##                C_Diet All Data
## 1            Omnivore     1049
## 2 Omnivore (Reducing)      562
## 3               Other       20
## 4         Pescatarian      133
## 5               Vegan      581
## 6          Vegetarian      264
write.table(output, "owner_diet_clusters.csv", sep = ",", row.names = FALSE,
    append = TRUE)
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = FALSE, : appending column names to file
for (CLUSTER in seq(0, max(db$cluster))) {
    print(paste("cluster", CLUSTER))
    this_clust <- subset(combi, cluster == CLUSTER)
    print(paste("total observations in cluster", CLUSTER, " = ",
        nrow(this_clust)))
    output <- dplyr::count(this_clust, C_Diet, name = paste("Cluster",
        CLUSTER))
    print(output)
    write.table(output, "owner_diet_clusters.csv", sep = ",",
        row.names = TRUE, append = TRUE)
}
## [1] "cluster 0"
## [1] "total observations in cluster 0  =  0"
## [1] C_Diet    Cluster 0
## <0 rows> (or 0-length row.names)
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 1"
## [1] "total observations in cluster 1  =  674"
##                C_Diet Cluster 1
## 1            Omnivore       434
## 2 Omnivore (Reducing)       235
## 3               Other         4
## 4         Pescatarian         1
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 2"
## [1] "total observations in cluster 2  =  374"
##                C_Diet Cluster 2
## 1            Omnivore         2
## 2 Omnivore (Reducing)         9
## 3               Other         1
## 4         Pescatarian         9
## 5               Vegan       331
## 6          Vegetarian        22
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 3"
## [1] "total observations in cluster 3  =  612"
##        C_Diet Cluster 3
## 1       Other         1
## 2 Pescatarian       119
## 3       Vegan       250
## 4  Vegetarian       242
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 4"
## [1] "total observations in cluster 4  =  505"
##                C_Diet Cluster 4
## 1            Omnivore       333
## 2 Omnivore (Reducing)       166
## 3               Other         6
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 5"
## [1] "total observations in cluster 5  =  130"
##                C_Diet Cluster 5
## 1            Omnivore        66
## 2 Omnivore (Reducing)        57
## 3               Other         5
## 4         Pescatarian         2
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file
## [1] "cluster 6"
## [1] "total observations in cluster 6  =  314"
##                C_Diet Cluster 6
## 1            Omnivore       214
## 2 Omnivore (Reducing)        95
## 3               Other         3
## 4         Pescatarian         2
## Warning in write.table(output, "owner_diet_clusters.csv", sep = ",", row.names
## = TRUE, : appending column names to file

##Owner Education clustering metrics

suppressWarnings(file.remove("owner_edu_clusters.csv"))
## [1] TRUE
combi <- cbind(df, db$cluster, Edu_Labs)
names(combi) <- c("x", "y", "cluster", "Education")
print(paste("total observations = ", nrow(combi)))
## [1] "total observations =  2609"
output <- dplyr::count(combi, Education, name = "All Data")

print(output)
##                                                                Education
## 1                                           Did not complete high school
## 2                                              High school or equivalent
## 3            College or University award lower than undergraduate degree
## 4                                        University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6                                                        Doctoral degree
##   All Data
## 1       42
## 2      478
## 3      746
## 4      740
## 5      511
## 6       92
write.table(output, "owner_edu_clusters.csv", sep = ",", row.names = FALSE,
    append = TRUE)
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## FALSE, : appending column names to file
for (CLUSTER in seq(0, max(db$cluster))) {
    print(paste("cluster", CLUSTER))
    this_clust <- subset(combi, cluster == CLUSTER)
    print(paste("total observations in cluster", CLUSTER, " = ",
        nrow(this_clust)))
    output <- dplyr::count(this_clust, Education, name = paste("Cluster",
        CLUSTER))
    print(output)

    write.table(output, "owner_edu_clusters.csv", sep = ",",
        row.names = TRUE, append = TRUE)
}
## [1] "cluster 0"
## [1] "total observations in cluster 0  =  0"
## [1] Education Cluster 0
## <0 rows> (or 0-length row.names)
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 1"
## [1] "total observations in cluster 1  =  674"
##                                                                Education
## 1                                           Did not complete high school
## 2                                              High school or equivalent
## 3            College or University award lower than undergraduate degree
## 4                                        University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6                                                        Doctoral degree
##   Cluster 1
## 1        10
## 2       143
## 3       199
## 4       182
## 5       121
## 6        19
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 2"
## [1] "total observations in cluster 2  =  374"
##                                                                Education
## 1                                           Did not complete high school
## 2                                              High school or equivalent
## 3            College or University award lower than undergraduate degree
## 4                                        University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6                                                        Doctoral degree
##   Cluster 2
## 1         3
## 2        51
## 3        74
## 4       125
## 5       101
## 6        20
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 3"
## [1] "total observations in cluster 3  =  612"
##                                                                Education
## 1                                           Did not complete high school
## 2                                              High school or equivalent
## 3            College or University award lower than undergraduate degree
## 4                                        University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6                                                        Doctoral degree
##   Cluster 3
## 1         5
## 2       118
## 3       160
## 4       152
## 5       152
## 6        25
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 4"
## [1] "total observations in cluster 4  =  505"
##                                                                Education
## 1                                           Did not complete high school
## 2                                              High school or equivalent
## 3            College or University award lower than undergraduate degree
## 4                                        University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6                                                        Doctoral degree
##   Cluster 4
## 1        14
## 2        97
## 3       181
## 4       139
## 5        66
## 6         8
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 5"
## [1] "total observations in cluster 5  =  130"
##                                                                Education
## 1                                           Did not complete high school
## 2                                              High school or equivalent
## 3            College or University award lower than undergraduate degree
## 4                                        University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6                                                        Doctoral degree
##   Cluster 5
## 1         2
## 2        10
## 3        29
## 4        50
## 5        26
## 6        13
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file
## [1] "cluster 6"
## [1] "total observations in cluster 6  =  314"
##                                                                Education
## 1                                           Did not complete high school
## 2                                              High school or equivalent
## 3            College or University award lower than undergraduate degree
## 4                                        University undergraduate degree
## 5 University postgraduate degree lower than doctoral degree e.g. masters
## 6                                                        Doctoral degree
##   Cluster 6
## 1         8
## 2        59
## 3       103
## 4        92
## 5        45
## 6         7
## Warning in write.table(output, "owner_edu_clusters.csv", sep = ",", row.names =
## TRUE, : appending column names to file

#Pull out the centeroids

p <- ggplot(df, aes(x, y)) + geom_point(aes(fill = Labs), alpha = 0.8,
    shape = 21, size = 2, stroke = 0.2) + labs(fill = "Dog Diet") +
    xlab("UMAP X co-ordinates") + ylab("UMAP Y co-ordinates") +
    # geom_segment(aes(y=bottom,yend=top,x=-Inf,xend=-Inf))+
    # geom_segment(aes(y=-Inf,yend=-Inf,x=left,xend=right))+
    # scale_y_continuous(breaks=seq(bottom,top,2))+
    # scale_x_continuous(breaks=seq(left,r-ight,2))+
geom_rangeframe() + theme_tufte(base_family = "Gill Sans") +
    geom_mark_circle(data = centeroids, aes(x = mx, y = my, group = rownames(centeroids)),
        fill = "white", alpha = 0.7, color = "black", con.type = "none",
        label.fill = "white", show.legend = FALSE, inherit.aes = FALSE) +
    geom_text(data = centeroids, aes(mx, my, label = rownames(centeroids)),
        size = 10, color = "black")

fname = paste0("umaps/", "i", i, "min_dist", min_dist, "_spread",
    spread, "_nn", n_n, "_lcon", local_con, "_neg", neg, "_epochs",
    n_epochs, "_trees", n_trees, "_", scale, ".png")
print(p)
## Warning: Using the `size` aesthetic in this geom was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` in the `default_aes` field and elsewhere instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

# ggsave(fname)

#Calculate silhoutte and cluster metric

set.seed(1234)

sil <- silhouette(db$cluster, dist(dplyr::select(combi, x, y)))
sil <- fviz_silhouette(sil)
##   cluster size ave.sil.width
## 1       1  674          0.35
## 2       2  374          0.92
## 3       3  612          0.64
## 4       4  505          0.59
## 5       5  130          0.85
## 6       6  314          0.80
sil_df <- sil$data
output <- data.frame(Cluster = as.numeric(), Score = as.numeric(),
    mx = as.numeric(), my = as.numeric())
mx <- 0
snx <- 0
for (cluster in unique(sil_df$cluster)) {
    c <- as.character(cluster)
    Score <- mean(subset(sil_df, cluster == c)$sil_width)
    my <- Score
    # halfway long the cluster
    nx <- nrow(subset(sil_df, cluster == c))
    mx <- snx + nx/2
    snx <- snx + nx

    print(paste(cluster, round(Score, 3), nx))
    local <- data.frame(Cluster = as.numeric(as.character(cluster)),
        Score = round(Score, 3), mx = mx, my = my)
    output <- rbind(output, local)
}
## [1] "1 0.347 674"
## [1] "2 0.92 374"
## [1] "3 0.639 612"
## [1] "4 0.593 505"
## [1] "5 0.853 130"
## [1] "6 0.799 314"
head(output)
write.csv(output, "sils.csv")
output <- subset(output, Cluster > 0)
p <- sil + geom_mark_circle(data = output, aes(x = mx, y = my,
    group = rownames(output)), fill = "white", alpha = 0.7, color = "black",
    con.type = "none", label.fill = "white", show.legend = FALSE,
    inherit.aes = FALSE) + geom_text(data = output, aes(mx, my,
    label = round(Score, 2)), size = 3, color = "black")
print(p)

ggsave(paste0(figures, "fviz_silhouette.tiff"), width = 5, height = 4,
    dpi = 300, compression = "lzw")

#Davies-Bouldin

result <- index.DB(dplyr::select(combi, x, y), combi$cluster,
    centrotypes = "centroids")

# print the Davies-Bouldin index
print(paste("Davies-Bouldin index is ", round(result$DB, 4)))
## [1] "Davies-Bouldin index is  0.547"

UMAP by dog diet

Labs <- D_DietLabs[uKeeper]  #remember we complete cased above!

# nice-ify labels
Labs <- stringr::str_replace(Labs, "Meat-based – conventional",
    "Meat-based")
Labs <- stringr::str_replace(Labs, "Vegan \\(consuming no animal products\\)",
    "Vegan")
Labs <- stringr::str_replace(Labs, "Meat-based – lab-grown",
    "Lab-grown")
Labs <- stringr::str_replace(Labs, "Meat-based – raw", "Raw food")
Labs <- stringr::str_replace(Labs, "Vegetarian \\(including eggs or milk, but not meat\\)",
    "Vegetarian")

top = 8
bottom = -6
left = -6
right = 10
ggplot(df, aes(x, y)) + geom_point(aes(fill = Labs), alpha = 0.8,
    shape = 21, size = 3, stroke = 0.2) + labs(fill = "Dog Diet") +
    xlab("UMAP X co-ordinates") + ylab("UMAP Y co-ordinates") +
    geom_segment(aes(y = bottom, yend = top, x = -Inf, xend = -Inf)) +
    geom_segment(aes(y = -Inf, yend = -Inf, x = left, xend = right)) +
    scale_fill_manual(values = rainbow(9)) + scale_y_continuous(breaks = seq(bottom,
    top, 2)) + scale_x_continuous(breaks = seq(left, right, 2)) +
    geom_rangeframe() + theme_tufte(base_family = "Gill Sans",
    base_size = 18) + theme(legend.title = element_text(size = 18),
    legend.text = element_text(size = 12)) + geom_rangeframe() +
    geom_mark_circle(data = centeroids, aes(x = mx, y = my, group = rownames(centeroids)),
        fill = "white", alpha = 0.3, color = "black", con.type = "none",
        label.fill = "white", show.legend = FALSE, inherit.aes = FALSE) +
    geom_text(data = centeroids, aes(mx, my, label = rownames(centeroids)),
        size = 8, color = "black")

ggsave(paste0(figures, "UMAP_Clear_Dog.TIFF"), compression = "lzw")

Initial Visualisation Excluding Health indicators with UMAP by extra metadata, for SI

cropbraket <- function(mystring) {
    newstring <- unlist(str_split(mystring, " \\("))[1]
}

plotum <- function(labels, lcat, df) {
    # number of legend rows
    lrow = 2
    if (length(unique(labels)) > 3) {
        lrow = 3
    }
    if (length(unique(labels)) > 6) {
        lrow = 4
    }

    ldf <- cbind(df, labels)
    ldf <- ldf[complete.cases(ldf), ]
    ldf <- ldf[order(ldf[, "labels"]), ]
    labels <- ldf$labels

    # Pretty the legend key
    labels <- str_replace(labels, "Prefer not to answer", "Not given")
    if (lcat == "D_Age")
        lcat = "Dog Age"
    if (lcat == "C_Age")
        lcat = "Owner Age"
    if (lcat == "Neuter_Status")
        lcat = "Neuter Status"
    if (lcat == "D_Sex")
        lcat = "Dog Sex"
    if (lcat == "C_Gender")
        lcat = "Owner Gender"
    if (lcat == "C_Diet") {
        lcat = "Owner Diet"
        # nice-ify labels
        labels <- stringr::str_replace(labels, "Vegetarian \\(consuming plants, eggs and milk\\)",
            "Vegetarian")
        labels <- stringr::str_replace(labels, "Omnivore \\(i.e. eat meat\\)",
            "Omnivore")
        labels <- stringr::str_replace(labels, "Omnivore reducing animal product consumption",
            "Reduced Meat")
        labels <- stringr::str_replace(labels, "Vegan \\(consuming no animal products\\)",
            "Vegan")
        labels <- stringr::str_replace(labels, "Pescatarian \\(including fish but no other meats\\)",
            "Pescatarian")
        order <- as.numeric(labels == "Vegan")
        ldf$order <- order
        ldf <- ldf[order(ldf[, "order"]), ]

        # Try and get the colours to match the Dog equiv
        tempcols <- (scales::hue_pal())(n = 9)
        colours <- tempcols[4:9]
    }

    if (lcat == "D_Diet")
        return(NULL)

    if (lcat == "Animal_Career") {
        lcat = "Animal Career"
        labels <- str_replace(labels, "None of the above", "None")
        labels <- str_replace(labels, "Pet industry worker",
            "Pet industry")
        labels <- str_replace(labels, "Animal trainer", "Trainer")
        labels <- str_replace(labels, "Veterinary technician or nurse",
            "Vet Tech/Nurse")
        labels <- str_replace(labels, "Animal breeder", "Breeder")
    }

    if (lcat == "Size")
        labels <- sapply(labels, cropbraket)

    if (lcat == "Decision_Maker") {
        lcat = "Decision Maker"
        labels <- str_replace(labels, "Primary decision-maker",
            "Primary")
        labels <- str_replace(labels, "Play some lesser role",
            "Some Role")
        labels <- str_replace(labels, "Play no role", "No Role")
    }

    if (lcat == "Location") {
        labels <- str_replace(labels, "North America", "N. America")
        labels <- str_replace(labels, "Other European", "Other Euro.")
        labels <- str_replace(labels, "Australia/New Zealand/Oceania",
            "Oceania")
    }

    if (lcat == "setting") {
        lcat = "Setting"
        labels <- str_replace(labels, "Equally urban and rural",
            "Equal U & R")
    }

    if (lcat == "Education") {
        labels <- str_replace(labels, "Did not complete high school",
            "0_Basic")
        labels <- str_replace(labels, "High school or equivalent",
            "1_HighSchool")
        labels <- str_replace(labels, "College or University award lower than undergraduate degree",
            "2_College")
        labels <- str_replace(labels, "University undergraduate degree",
            "3_Grad")
        labels <- str_replace(labels, "University postgraduate degree lower than doctoral degree e.g. masters",
            "4_PG")
        labels <- str_replace(labels, "Doctoral degree", "5_PhD")
    }

    labels <- as.factor(str_trunc(labels, 12, "right", ""))
    if (lcat == "Dog Age")
        labels <- as.numeric(labels)
    # ldf <- dplyr::select(ldf,-labels)

    # Put the labels back in!
    ldf$labels <- labels

    # remove this version to remove confusion
    rm(labels)

    top = 8
    bottom = -6
    left = -6
    right = 10
    p <- ggplot()
    if (lcat == "Dog Diet") {
        p <- p + geom_point(data = subset(ldf, !(labels %in%
            c("Vegan"))), aes(x = x, y = y, fill = labels), alpha = 0.8,
            shape = 21, size = 3, stroke = 0.2) + geom_point(data = subset(ldf,
            labels == "Vegan"), aes(x = x, y = y, fill = labels),
            alpha = 0.8, shape = 21, size = 3, stroke = 0.2)
    } else if (lcat == "Decision Maker") {
        p <- p + geom_point(data = subset(ldf, !(labels %in%
            c("No Role", "Some Role"))), aes(x = x, y = y, fill = labels),
            alpha = 0.8, shape = 21, size = 3, stroke = 0.2) +
            geom_point(data = subset(ldf, labels == "No Role"),
                aes(x = x, y = y, fill = labels), alpha = 0.8,
                shape = 21, size = 3, stroke = 0.2) + geom_point(data = subset(ldf,
            labels == "Some Role"), aes(x = x, y = y, fill = labels),
            alpha = 0.8, shape = 21, size = 3, stroke = 0.2)
    } else {
        p <- p + geom_point(data = ldf, aes(x = x, y = y, fill = labels),
            alpha = 0.8, shape = 21, size = 3, stroke = 0.2)
    }

    p <- p + labs(fill = lcat) + xlab("UMAP X co-ordinates") +
        ylab("UMAP Y co-ordinates")
    if (lcat == "Dog Age") {
        p <- p + scale_fill_gradientn(colors = rainbow(8))
    } else if (lcat == "Owner Diet") {
        p <- p + scale_fill_manual(values = colours, limits = unique(ldf$labels))
    }
    p <- p + geom_segment(aes(y = bottom, yend = top, x = -Inf,
        xend = -Inf)) + geom_segment(aes(y = -Inf, yend = -Inf,
        x = left, xend = right)) + scale_y_continuous(breaks = seq(bottom,
        top, 2)) + scale_x_continuous(breaks = seq(left, right,
        2)) + geom_rangeframe() + theme_tufte(base_family = "Gill Sans",
        base_size = 18) + theme(legend.title = element_text(size = 18),
        legend.text = element_text(size = 12)) + geom_mark_circle(data = centeroids,
        aes(x = mx, y = my, group = rownames(centeroids)), fill = "white",
        alpha = 0.3, color = "black", con.type = "none", label.fill = "white",
        show.legend = FALSE, inherit.aes = FALSE) + geom_text(data = centeroids,
        aes(mx, my, label = rownames(centeroids)), size = 8,
        color = "black")

    print(p)
    ggsave(paste0(figures, "UMAP_", lcat, ".TIFF"), compression = "lzw")
    return(ldf)
}

for (label in names(Udata)) {
    print(paste("starting", label))

    if (label == "Neuter_Status") {
        plotum(uml[["D_Neuter"]][uKeeper], label, df)
    } else {
        DF <- plotum(uml[[label]][uKeeper], label, df)
    }

    print(paste("completed", label))
}
## [1] "starting D_Age"

## [1] "completed D_Age"
## [1] "starting C_Age"

## [1] "completed C_Age"
## [1] "starting Education"

## [1] "completed Education"
## [1] "starting Size"

## [1] "completed Size"
## [1] "starting Income"

## [1] "completed Income"
## [1] "starting Neuter_Status"

## [1] "completed Neuter_Status"
## [1] "starting D_Sex"

## [1] "completed D_Sex"
## [1] "starting C_Gender"

## [1] "completed C_Gender"
## [1] "starting Animal_Career"

## [1] "completed Animal_Career"
## [1] "starting C_Diet"

## [1] "completed C_Diet"
## [1] "starting D_Diet"
## [1] "completed D_Diet"
## [1] "starting setting"

## [1] "completed setting"
## [1] "starting Decision_Maker"

## [1] "completed Decision_Maker"
## [1] "starting Location"

## [1] "completed Location"

Create ml data frame

ml <- dplyr::select(ml, c(unique.resp.number, Location, setting,
    Education, Animal_Career, Income, C_Age, C_Gender, C_Diet,
    Size, D_Age, D_Gender, D_Sex, D_Neuter, Therapeutic_Food,
    Visits, Vet_Health, Meds, Health, D_Diet, Decision_Maker))

# check columns
head(ml)

Categorise Fog Age into quintiles

ml <- subset(ml, D_Age != "NA")
ml$D_Age_quant <- cut(ml$D_Age, breaks = quantile(ml$D_Age, probs = seq(0,
    1, 0.2)), na.rm = TRUE, include.lowest = TRUE)
dplyr::count(ml, D_Age_quant)

Check Sex

ml$D_Sex <- as.factor(ml$D_Sex)
# Checks and Balances
dplyr::count(ml, D_Sex)

Check Neuter

ml$D_Neuter <- as.factor(ml$D_Neuter)
# Checks and Balances
dplyr::count(ml, D_Neuter)

Check Location

dplyr::count(ml, Location)

combine smaller groups into logical categories

for (i in 1:nrow(ml)) {
    if (grepl("UK", ml$Location[i], fixed = TRUE)) {
        ml$Location[i] <- "UK"
    } else if (grepl("Other European", ml$Location[i], fixed = TRUE)) {
        ml$Location[i] <- "Other European"
    } else if (grepl("North America", ml$Location[i], fixed = TRUE)) {
        ml$Location[i] <- "North America"
    } else if (grepl("Australia/New Zealand/Oceania", ml$Location[i],
        fixed = TRUE)) {
        ml$Location[i] <- "Australia/New Zealand/Oceania"
    } else {
        ml$Location[i] <- "Other"
    }
}
ml$Location <- factor(ml$Location, levels = c("UK", "Other European",
    "North America", "Australia/New Zealand/Oceania", "Other"))
dplyr::count(ml, Location)

create Location_Europe binary

# Create a binary Loaction_Europe variable
ml$Location_Europe <- ml$Location
ml$Location_Europe <- as.character(ml$Location_Europe)
for (i in 1:nrow(ml)) {
    if (grepl("Other European", ml$Location_Europe[i], fixed = TRUE)) {
        ml$Location_Europe[i] <- "Yes"
    } else {
        ml$Location_Europe[i] <- "No"
    }
}
dplyr::count(ml, Location_Europe)

check setting

dplyr::count(ml, setting)

Not much point including data where setting is “other”

ml <- subset(ml, setting != "Other" | setting != NA)
ml$setting <- as.factor(ml$setting)
dplyr::count(ml, setting)

Reorder SETTING and check

# reorder categories so Urban is REF
ml$setting <- factor(ml$setting, levels = c("Urban", "Rural",
    "Equally urban and rural"))
dplyr::count(ml, setting)

create binary for URBAN

# Create a binary URBAN variable
ml$Urban <- ml$setting
ml$Urban <- as.character(ml$Urban)
for (i in 1:nrow(ml)) {
    if (grepl("Urban", ml$Urban[i], fixed = TRUE)) {
        ml$Urban[i] <- "Yes"
    } else {
        ml$Urban[i] <- "No"
    }
}
dplyr::count(ml, Urban)

Check Education

dplyr::count(ml, Education)

Simplify education and order categories

Qualifications = c("0_Basic", "1_HighSchool", "2_College", "3_Grad",
    "4_PG", "5_PhD")
ml$Education <- str_replace(ml$Education, "Did not complete high school",
    "0_Basic")
ml$Education <- str_replace(ml$Education, "High school or equivalent",
    "1_HighSchool")
ml$Education <- str_replace(ml$Education, "College or University award lower than undergraduate degree",
    "2_College")
ml$Education <- str_replace(ml$Education, "University undergraduate degree",
    "3_Grad")
ml$Education <- str_replace(ml$Education, "University postgraduate degree lower than doctoral degree e.g. masters",
    "4_PG")
ml$Education <- str_replace(ml$Education, "Doctoral degree",
    "5_PhD")
ml$Education <- ordered(ml$Education)
dplyr::count(ml, Education)

#Simplify education further as small group sizes

Qualifications_S = c("0_Basic_or_HighSchool", "1_College", "2_Grad",
    "3_PG_or_PhD")
ml$Education_S <- ml$Education
ml$Education_S <- str_replace(ml$Education_S, "0_Basic", "0_Basic_or_HighSchool")
ml$Education_S <- str_replace(ml$Education_S, "1_HighSchool",
    "0_Basic_or_HighSchool")
ml$Education_S <- str_replace(ml$Education_S, "2_College", "1_College")
ml$Education_S <- str_replace(ml$Education_S, "3_Grad", "2_Grad")
ml$Education_S <- str_replace(ml$Education_S, "4_PG", "3_PG_or_PhD")
ml$Education_S <- str_replace(ml$Education_S, "5_PhD", "3_PG_or_PhD")
dplyr::count(ml, Education_S)

Try non-ordered categories for Education_S

ml$Education_S2 <- factor(ml$Education_S, ordered = FALSE)
dplyr::count(ml, Education_S2)

check Animal_Career variable

dplyr::count(ml, Animal_Career)

create animal career variable where vet combined

ml$Animal_Career2 <- ml$Animal_Career
for (i in 1:nrow(ml)) {
    if (grepl("Vet", ml$Animal_Career2[i], fixed = TRUE)) {
        ml$Animal_Career2[i] <- "Vet professional"
    } else if (grepl("Animal breeder", ml$Animal_Career2[i], fixed = TRUE)) {
        ml$Animal_Career2[i] <- "breeder/trainer"
    } else if (grepl("Animal trainer", ml$Animal_Career2[i], fixed = TRUE)) {
        ml$Animal_Career2[i] <- "breeder/trainer"
    } else if (grepl("Pet industry worker", ml$Animal_Career2[i],
        fixed = TRUE)) {
        ml$Animal_Career2[i] <- "Pet industry worker"
    } else {
        ml$Animal_Career2[i] <- "None of the above"
    }
}
# reorder categories so None of the above is REF
ml$Animal_Career2 <- factor(ml$Animal_Career2, levels = c("None of the above",
    "Vet professional", "breeder/trainer", "Pet industry worker"))
dplyr::count(ml, Animal_Career2)

create binary variable for animal career as ensures better balance

ml$Animal_Career_BINARY <- ml$Animal_Career

for (i in 1:nrow(ml)) {
    if (grepl("None of the above", ml$Animal_Career_BINARY[i],
        fixed = TRUE)) {
        ml$Animal_Career_BINARY[i] <- "No"
    } else {
        ml$Animal_Career_BINARY[i] <- "Yes"
    }
}
dplyr::count(ml, Animal_Career_BINARY)

Check Income

dplyr::count(ml, Income)

Two choice with Income; treat as factor or drop “prefer not to say” and treat as ordinal.

ml <- subset(ml, Income != "Prefer not to answer" | Income !=
    NA)
ml$Income <- ordered(ml$Income, levels = c("Low", "Medium", "High"))
dplyr::count(ml, Income)

Try non-ordered categories for Income

ml$Income2 <- factor(ml$Income, ordered = FALSE)
dplyr::count(ml, Income2)

Check C_Age

dplyr::count(ml, C_Age)

combine smallest categories and reorder

ml$C_Age <- str_replace(ml$C_Age, "18-19", "<30")
ml$C_Age <- str_replace(ml$C_Age, "20–29", "<30")
ml$C_Age <- str_replace(ml$C_Age, ">70", "60<")
ml$C_Age <- str_replace(ml$C_Age, "60–69", "60<")
ml$C_Age <- ordered(ml$C_Age)
# Check right order
ml$C_Age[nrow(ml)]
## [1] 60<
## Levels: <30 < 30–39 < 40–49 < 50–59 < 60<
dplyr::count(ml, C_Age)

Try non-ordered categories for c_Age

ml$C_Age2 <- factor(ml$C_Age, ordered = FALSE)
dplyr::count(ml, C_Age2)

Check C_Gender

dplyr::count(ml, C_Gender)

Given small numbers in some categories, create male/female subset for C_Gender

ml <- subset(ml, C_Gender == "Male" | C_Gender == "Female")
ml$C_Gender <- as.factor(ml$C_Gender)
dplyr::count(ml, C_Gender)

Check C_Diet

dplyr::count(ml, C_Diet)

Not much point including other category as small numbers

ml <- subset(ml, C_Diet != "Other" | C_Diet != NA)
dplyr::count(ml, C_Diet)

create binary C_Vegan variable

ml$C_Diet_Vegan <- ml$C_Diet
for (i in 1:nrow(ml)) {
    if (grepl("Vegan", ml$C_Diet_Vegan[i], fixed = TRUE)) {
        ml$C_Diet_Vegan[i] <- "Yes"
    } else {
        ml$C_Diet_Vegan[i] <- "No"
    }
}
dplyr::count(ml, C_Diet_Vegan)

create binary C_Vegan_Veggie variable

ml$C_Diet_Vegan_Veggie <- ml$C_Diet
for (i in 1:nrow(ml)) {
    if (grepl("Veg", ml$C_Diet_Vegan_Veggie[i], fixed = TRUE)) {
        ml$C_Diet_Vegan_Veggie[i] <- "Yes"
    } else {
        ml$C_Diet_Vegan_Veggie[i] <- "No"
    }
}
dplyr::count(ml, C_Diet_Vegan_Veggie)

reorder and rename size categories

for (i in 1:nrow(ml)) {
    if (grepl("Toy", ml$Size[i], fixed = TRUE)) {
        ml$Size[i] <- "Toy"
    } else if (grepl("Small", ml$Size[i], fixed = TRUE)) {
        ml$Size[i] <- "Small"
    } else if (grepl("Medium", ml$Size[i], fixed = TRUE)) {
        ml$Size[i] <- "Medium"
    } else if (grepl("Large", ml$Size[i], fixed = TRUE)) {
        ml$Size[i] <- "Large"
    } else {
        ml$Size[i] <- "Giant"
    }
}
ml$Size <- ordered(ml$Size, levels = c("Toy", "Small", "Medium",
    "Large", "Giant"))
dplyr::count(ml, Size)

Try non-ordered size categories with medium as reference

ml$Size2 <- factor(ml$Size, levels = c("Medium", "Toy", "Small",
    "Large", "Giant"), ordered = FALSE)
dplyr::count(ml, Size2)

create binary Giant breed variable

Giant = c("Yes", "No")
ml$Size_Giant <- ml$Size2
ml$Size_Giant <- str_replace(ml$Size_Giant, "Giant", "Yes")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Toy", "No")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Medium", "No")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Small", "No")
ml$Size_Giant <- str_replace(ml$Size_Giant, "Large", "No")
dplyr::count(ml, Size_Giant)

create theraeputic food category

ml$Therapeutic_Food <- as.character(ml$Therapeutic_Food)
ml <- subset(ml, Therapeutic_Food != "NA" | Therapeutic_Food !=
    NA)
dplyr::count(ml, Therapeutic_Food)

check visit data

ml <- subset(ml, Visits != "NA")
dplyr::count(ml, Visits)

Not much point including data where clients are unsure of visit numbers.

ml <- subset(ml, Visits != "Unsure" | Visits != NA)
ml <- subset(ml, Visits != "NA" | Visits != NA)
dplyr::count(ml, Visits)

rename and reorder Visits

ml$Visits <- str_replace(ml$Visits, ">3", "3<")
ml$Visits <- str_replace(ml$Visits, "0", "0")
ml$Visits <- str_replace(ml$Visits, "1", "1")
ml$Visits <- str_replace(ml$Visits, "2", "2")
ml$Visits <- str_replace(ml$Visits, "3", "3")
dplyr::count(ml, ml$Visits)

Simplify as Visits2 to ensure proportionality

ml$Visits2 <- ml$Visits
ml$Visits2 <- str_replace(ml$Visits2, "3<", "3")
ml$Visits2 <- str_replace(ml$Visits2, "3", "3v")
ml$Visits2 <- str_replace(ml$Visits2, "0", "0v")
ml$Visits2 <- str_replace(ml$Visits2, "1", "1v")
ml$Visits2 <- str_replace(ml$Visits2, "2", "2v")

dplyr::count(ml, ml$Visits2)

check Meds

ml$Meds <- as.factor(ml$Meds)
dplyr::count(ml, Meds)

check Health category

dplyr::count(ml, ml$Health)

Tidy up Health category

Not much point including data where clients unsure of pet health.

Also need to reorder categories

ml <- subset(ml, Health != "Unsure" | Health != NA)
ml$Health <- as.factor(ml$Health)
ml$Health <- ordered(ml$Health, levels = c("Healthy", "Generally healthy with minor or infrequent problems",
    "Significant or frequent problems", "Seriously ill"))
dplyr::count(ml, Health)

create Health2 variable where serious and significant are combined

Health_severity = c("0_None", "1_Mild", "2_Significant")
ml$Health2 <- ml$Health
ml$Health2 <- str_replace(ml$Health2, "Healthy", "0_None")
ml$Health2 <- str_replace(ml$Health2, "Generally healthy with minor or infrequent problems",
    "1_Mild")
ml$Health2 <- str_replace(ml$Health2, "Significant or frequent problems",
    "2_Significant")
ml$Health2 <- str_replace(ml$Health2, "Seriously ill", "2_Significant")
ml$Health2 <- ordered(ml$Health2, levels = c("0_None", "1_Mild",
    "2_Significant"))
dplyr::count(ml, ml$Health2)

create binary Health variable

bhealth <- rep(0, nrow(ml))
for (i in 1:nrow(ml)) {
    if (grepl("healthy", ml$Health[i], ignore.case = TRUE)) {
        # 0 is good health
        bhealth[i] <- 0
    } else {
        bhealth[i] <- 1
    }
}
# Keep as a variable not factor!
ml$Health_Binary <- as.numeric(bhealth)
rm(bhealth)
# Checks and Balances, hey it's VERY unbalanced
dplyr::count(ml, Health_Binary)

create binary Any health problem variable

bhealth2 <- rep(0, nrow(ml))
for (i in 1:nrow(ml)) {
    if (grepl("0_None", ml$Health2[i], ignore.case = TRUE)) {
        # 0 is good health
        bhealth2[i] <- 0
    } else {
        bhealth2[i] <- 1
    }
}
# Keep as a variable not factor!
ml$Any_Health_Problem <- as.numeric(bhealth2)
rm(bhealth2)
# Checks and Balances, better balanced this time.
dplyr::count(ml, Any_Health_Problem)

check D_Diet

dplyr::count(ml, D_Diet)

Not much point including those that are unsure.

ml <- subset(ml, D_Diet != "Unsure" | D_Diet != NA)
dplyr::count(ml, D_Diet)

Not much point including diets where there are small numbers

ml <- subset(ml, D_Diet != "Mixture" | D_Diet != NA)
ml <- subset(ml, D_Diet != "Insect-based" | D_Diet != NA)
ml <- subset(ml, D_Diet != "Meat-based – lab-grown" | D_Diet !=
    NA)
dplyr::count(ml, D_Diet)

create binary D_Vegan variable

ml$D_Diet_Vegan <- ml$D_Diet
for (i in 1:nrow(ml)) {
    if (grepl("Vegan", ml$D_Diet_Vegan[i], fixed = TRUE)) {
        ml$D_Diet_Vegan[i] <- "Yes"
    } else {
        ml$D_Diet_Vegan[i] <- "No"
    }
}
dplyr::count(ml, D_Diet_Vegan)

create binary D_Raw variable

ml$D_Diet_Raw <- ml$D_Diet
for (i in 1:nrow(ml)) {
    if (grepl("raw", ml$D_Diet_Raw[i], fixed = TRUE)) {
        ml$D_Diet_Raw[i] <- "Yes"
    } else {
        ml$D_Diet_Raw[i] <- "No"
    }
}
dplyr::count(ml, D_Diet_Raw)

create binary D_Vegan_Veggie variable

ml$D_Diet_Vegan_Veggie <- ml$D_Diet
for (i in 1:nrow(ml)) {
    if (grepl("Veg", ml$D_Diet_Vegan_Veggie[i], fixed = TRUE)) {
        ml$D_Diet_Vegan_Veggie[i] <- "Yes"
    } else {
        ml$D_Diet_Vegan_Veggie[i] <- "No"
    }
}
dplyr::count(ml, D_Diet_Vegan_Veggie)

check Decision_maker

dplyr::count(ml, Decision_Maker)

Better only to include data from clients that were primary decision makers as likely to be more accurate.

ml <- subset(ml, Decision_Maker == "Primary decision-maker")
dplyr::count(ml, Decision_Maker)

Let’s just keep those we will need

ml <- subset(ml, select = c("Location", "setting", "Urban", "Education",
    "Education_S", "Education_S2", "Animal_Career", "Animal_Career2",
    "Animal_Career_BINARY", "Income", "Income2", "C_Age", "C_Age2",
    "C_Gender", "C_Diet", "C_Diet_Vegan", "C_Diet_Vegan_Veggie",
    "D_Age", "D_Age_quant", "Size", "Size2", "Size_Giant", "D_Sex",
    "D_Neuter", "Therapeutic_Food", "Meds", "D_Diet", "D_Diet_Vegan",
    "D_Diet_Vegan_Veggie", "D_Diet_Raw", "Visits", "Visits2",
    "Health", "Health2", "Health_Binary", "Any_Health_Problem",
    "Location_Europe"))

# check columns
head(ml)

summary data

describe(ml)
## ml 
## 
##  37  Variables      2211  Observations
## --------------------------------------------------------------------------------
## Location 
##        n  missing distinct 
##     2211        0        5 
## 
## UK (1609, 0.728), Other European (318, 0.144), North America (119, 0.054),
## Australia/New Zealand/Oceania (97, 0.044), Other (68, 0.031)
## --------------------------------------------------------------------------------
## setting 
##        n  missing distinct 
##     2211        0        3 
##                                                           
## Value                        Urban                   Rural
## Frequency                      780                     728
## Proportion                   0.353                   0.329
##                                   
## Value      Equally urban and rural
## Frequency                      703
## Proportion                   0.318
## --------------------------------------------------------------------------------
## Urban 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1431   780
## Proportion 0.647 0.353
## --------------------------------------------------------------------------------
## Education 
##        n  missing distinct 
##     2211        0        6 
##                                                                            
## Value           0_Basic 1_HighSchool    2_College       3_Grad         4_PG
## Frequency            29          385          636          645          442
## Proportion        0.013        0.174        0.288        0.292        0.200
##                        
## Value             5_PhD
## Frequency            74
## Proportion        0.033
## --------------------------------------------------------------------------------
## Education_S 
##        n  missing distinct 
##     2211        0        4 
##                                                                             
## Value      0_Basic_or_HighSchool             1_College                2_Grad
## Frequency                    414                   636                   645
## Proportion                 0.187                 0.288                 0.292
##                                 
## Value                3_PG_or_PhD
## Frequency                    516
## Proportion                 0.233
## --------------------------------------------------------------------------------
## Education_S2 
##        n  missing distinct 
##     2211        0        4 
##                                                                             
## Value      0_Basic_or_HighSchool             1_College                2_Grad
## Frequency                    414                   636                   645
## Proportion                 0.187                 0.288                 0.292
##                                 
## Value                3_PG_or_PhD
## Frequency                    516
## Proportion                 0.233
## --------------------------------------------------------------------------------
## Animal_Career 
##        n  missing distinct 
##     2211        0        6 
## 
## Animal breeder (15, 0.007), Animal trainer (122, 0.055), None of the above
## (1786, 0.808), Pet industry worker (181, 0.082), Veterinarian (57, 0.026),
## Veterinary technician or nurse (50, 0.023)
## --------------------------------------------------------------------------------
## Animal_Career2 
##        n  missing distinct 
##     2211        0        4 
##                                                                       
## Value        None of the above    Vet professional     breeder/trainer
## Frequency                 1786                 107                 137
## Proportion               0.808               0.048               0.062
##                               
## Value      Pet industry worker
## Frequency                  181
## Proportion               0.082
## --------------------------------------------------------------------------------
## Animal_Career_BINARY 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1786   425
## Proportion 0.808 0.192
## --------------------------------------------------------------------------------
## Income 
##        n  missing distinct 
##     2211        0        3 
##                                
## Value         Low Medium   High
## Frequency     352   1530    329
## Proportion  0.159  0.692  0.149
## --------------------------------------------------------------------------------
## Income2 
##        n  missing distinct 
##     2211        0        3 
##                                
## Value         Low Medium   High
## Frequency     352   1530    329
## Proportion  0.159  0.692  0.149
## --------------------------------------------------------------------------------
## C_Age 
##        n  missing distinct 
##     2211        0        5 
##                                         
## Value        <30 30–39 40–49 50–59   60<
## Frequency    340   482   452   515   422
## Proportion 0.154 0.218 0.204 0.233 0.191
## --------------------------------------------------------------------------------
## C_Age2 
##        n  missing distinct 
##     2211        0        5 
##                                         
## Value        <30 30–39 40–49 50–59   60<
## Frequency    340   482   452   515   422
## Proportion 0.154 0.218 0.204 0.233 0.191
## --------------------------------------------------------------------------------
## C_Gender 
##        n  missing distinct 
##     2211        0        2 
##                         
## Value      Female   Male
## Frequency    2055    156
## Proportion  0.929  0.071
## --------------------------------------------------------------------------------
## C_Diet 
##        n  missing distinct 
##     2211        0        5 
## 
## lowest : Omnivore (i.e. eat meat)                        Omnivore reducing animal product consumption    Pescatarian (including fish but no other meats) Vegan (consuming no animal products)            Vegetarian (consuming plants, eggs and milk)   
## highest: Omnivore (i.e. eat meat)                        Omnivore reducing animal product consumption    Pescatarian (including fish but no other meats) Vegan (consuming no animal products)            Vegetarian (consuming plants, eggs and milk)   
## --------------------------------------------------------------------------------
## C_Diet_Vegan 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1718   493
## Proportion 0.777 0.223
## --------------------------------------------------------------------------------
## C_Diet_Vegan_Veggie 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1497   714
## Proportion 0.677 0.323
## --------------------------------------------------------------------------------
## D_Age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##     2211        0       19    0.993    6.177    4.103        1        2 
##      .25      .50      .75      .90      .95 
##        3        6        9       11       13 
##                                                                             
## Value          1     2     3     4     5     6     7     8     9    10    11
## Frequency    143   243   245   229   229   195   184   171   137   127    97
## Proportion 0.065 0.110 0.111 0.104 0.104 0.088 0.083 0.077 0.062 0.057 0.044
##                                                           
## Value         12    13    14    15    16    17    18    20
## Frequency     67    54    48    26     7     7     1     1
## Proportion 0.030 0.024 0.022 0.012 0.003 0.003 0.000 0.000
## 
## For the frequency table, variable is rounded to the nearest 0
## --------------------------------------------------------------------------------
## D_Age_quant 
##        n  missing distinct 
##     2211        0        5 
##                                                        
## Value         [1,3]    (3,5]    (5,7]  (7,9.4] (9.4,20]
## Frequency       631      458      379      308      435
## Proportion    0.285    0.207    0.171    0.139    0.197
## --------------------------------------------------------------------------------
## Size 
##        n  missing distinct 
##     2211        0        5 
##                                              
## Value         Toy  Small Medium  Large  Giant
## Frequency      52    439    859    764     97
## Proportion  0.024  0.199  0.389  0.346  0.044
## --------------------------------------------------------------------------------
## Size2 
##        n  missing distinct 
##     2211        0        5 
##                                              
## Value      Medium    Toy  Small  Large  Giant
## Frequency     859     52    439    764     97
## Proportion  0.389  0.024  0.199  0.346  0.044
## --------------------------------------------------------------------------------
## Size_Giant 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   2114    97
## Proportion 0.956 0.044
## --------------------------------------------------------------------------------
## D_Sex 
##        n  missing distinct 
##     2211        0        2 
##                         
## Value      Female   Male
## Frequency    1042   1169
## Proportion  0.471  0.529
## --------------------------------------------------------------------------------
## D_Neuter 
##        n  missing distinct 
##     2211        0        2 
##                             
## Value        Intact Neutered
## Frequency       486     1725
## Proportion     0.22     0.78
## --------------------------------------------------------------------------------
## Therapeutic_Food 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   2111   100
## Proportion 0.955 0.045
## --------------------------------------------------------------------------------
## Meds 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1331   880
## Proportion 0.602 0.398
## --------------------------------------------------------------------------------
## D_Diet 
##        n  missing distinct 
##     2211        0        4 
## 
## Meat-based – conventional (1152, 0.521), Meat-based – raw (732, 0.331), Vegan
## (consuming no animal products) (295, 0.133), Vegetarian (including eggs or
## milk, but not meat) (32, 0.014)
## --------------------------------------------------------------------------------
## D_Diet_Vegan 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1916   295
## Proportion 0.867 0.133
## --------------------------------------------------------------------------------
## D_Diet_Vegan_Veggie 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1884   327
## Proportion 0.852 0.148
## --------------------------------------------------------------------------------
## D_Diet_Raw 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1479   732
## Proportion 0.669 0.331
## --------------------------------------------------------------------------------
## Visits 
##        n  missing distinct 
##     2211        0        5 
##                                         
## Value          0     1     2     3    3<
## Frequency    399   873   473   185   281
## Proportion 0.180 0.395 0.214 0.084 0.127
## --------------------------------------------------------------------------------
## Visits2 
##        n  missing distinct 
##     2211        0        4 
##                                   
## Value         0v    1v    2v    3v
## Frequency    399   873   473   466
## Proportion 0.180 0.395 0.214 0.211
## --------------------------------------------------------------------------------
## Health 
##        n  missing distinct 
##     2211        0        4 
## 
## --------------------------------------------------------------------------------
## Health2 
##        n  missing distinct 
##     2211        0        3 
##                                                     
## Value             0_None        1_Mild 2_Significant
## Frequency           1375           714           122
## Proportion         0.622         0.323         0.055
## --------------------------------------------------------------------------------
## Health_Binary 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     2211        0        2    0.156      122  0.05518   0.1043 
## 
## --------------------------------------------------------------------------------
## Any_Health_Problem 
##        n  missing distinct     Info      Sum     Mean      Gmd 
##     2211        0        2    0.705      836   0.3781   0.4705 
## 
## --------------------------------------------------------------------------------
## Location_Europe 
##        n  missing distinct 
##     2211        0        2 
##                       
## Value         No   Yes
## Frequency   1893   318
## Proportion 0.856 0.144
## --------------------------------------------------------------------------------

check C_Diet category numbers

dplyr::count(ml, C_Diet)

check D_Sex by D_Neuter

## two way cross tabs (xtabs) and flatten the table
ftable(xtabs(~D_Sex + D_Neuter, data = ml))
##        D_Neuter Intact Neutered
## D_Sex                          
## Female             192      850
## Male               294      875

Finally, create train and test datasets for use in logistic regression analyses

index <- sample(nrow(ml), nrow(ml) * 0.75)
# set seed to make partition
set.seed(123)
ml_train = ml[index, ]
ml_test = ml[-index, ]

Citations for packages

citation("stringr")
## 
## To cite package 'stringr' in publications use:
## 
##   Wickham H (2022). _stringr: Simple, Consistent Wrappers for Common
##   String Operations_. R package version 1.5.0,
##   <https://CRAN.R-project.org/package=stringr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {stringr: Simple, Consistent Wrappers for Common String Operations},
##     author = {Hadley Wickham},
##     year = {2022},
##     note = {R package version 1.5.0},
##     url = {https://CRAN.R-project.org/package=stringr},
##   }
packageVersion("stringr")
## [1] '1.5.0'
citation("dplyr")
## 
## To cite package 'dplyr' in publications use:
## 
##   Wickham H, François R, Henry L, Müller K, Vaughan D (2023). _dplyr: A
##   Grammar of Data Manipulation_. R package version 1.1.3,
##   <https://CRAN.R-project.org/package=dplyr>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {dplyr: A Grammar of Data Manipulation},
##     author = {Hadley Wickham and Romain François and Lionel Henry and Kirill Müller and Davis Vaughan},
##     year = {2023},
##     note = {R package version 1.1.3},
##     url = {https://CRAN.R-project.org/package=dplyr},
##   }
packageVersion("dplyr")
## [1] '1.1.3'
citation("DescTools")
## 
## To cite package 'DescTools' in publications use:
## 
##   Signorell A (2023). _DescTools: Tools for Descriptive Statistics_. R
##   package version 0.99.50,
##   <https://CRAN.R-project.org/package=DescTools>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {DescTools: Tools for Descriptive Statistics},
##     author = {Andri Signorell},
##     year = {2023},
##     note = {R package version 0.99.50},
##     url = {https://CRAN.R-project.org/package=DescTools},
##   }
packageVersion("DescTools")
## [1] '0.99.50'
citation("car")
## 
## To cite the car package in publications use:
## 
##   Fox J, Weisberg S (2019). _An R Companion to Applied Regression_,
##   Third edition. Sage, Thousand Oaks CA.
##   <https://socialsciences.mcmaster.ca/jfox/Books/Companion/>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     title = {An {R} Companion to Applied Regression},
##     edition = {Third},
##     author = {John Fox and Sanford Weisberg},
##     year = {2019},
##     publisher = {Sage},
##     address = {Thousand Oaks {CA}},
##     url = {https://socialsciences.mcmaster.ca/jfox/Books/Companion/},
##   }
packageVersion("car")
## [1] '3.1.2'
citation("foreign")
## 
## To cite package 'foreign' in publications use:
## 
##   R Core Team (2023). _foreign: Read Data Stored by 'Minitab', 'S',
##   'SAS', 'SPSS', 'Stata', 'Systat', 'Weka', 'dBase', ..._. R package
##   version 0.8-85, <https://CRAN.R-project.org/package=foreign>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {foreign: Read Data Stored by 'Minitab', 'S', 'SAS', 'SPSS', 'Stata',
## 'Systat', 'Weka', 'dBase', ...},
##     author = {{R Core Team}},
##     year = {2023},
##     note = {R package version 0.8-85},
##     url = {https://CRAN.R-project.org/package=foreign},
##   }
packageVersion("foreign")
## [1] '0.8.85'
citation("ggplot2")
## 
## To cite ggplot2 in publications, please use
## 
##   H. Wickham. ggplot2: Elegant Graphics for Data Analysis.
##   Springer-Verlag New York, 2016.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Book{,
##     author = {Hadley Wickham},
##     title = {ggplot2: Elegant Graphics for Data Analysis},
##     publisher = {Springer-Verlag New York},
##     year = {2016},
##     isbn = {978-3-319-24277-4},
##     url = {https://ggplot2.tidyverse.org},
##   }
packageVersion("ggplot2")
## [1] '3.4.3'
citation("Hmisc")
## 
## To cite package 'Hmisc' in publications use:
## 
##   Harrell Jr F (2023). _Hmisc: Harrell Miscellaneous_. R package
##   version 5.1-1, <https://CRAN.R-project.org/package=Hmisc>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {Hmisc: Harrell Miscellaneous},
##     author = {Frank E {Harrell Jr}},
##     year = {2023},
##     note = {R package version 5.1-1},
##     url = {https://CRAN.R-project.org/package=Hmisc},
##   }
packageVersion("Hmisc")
## [1] '5.1.1'
citation("reshape2")
## 
## To cite reshape2 in publications use:
## 
##   Hadley Wickham (2007). Reshaping Data with the reshape Package.
##   Journal of Statistical Software, 21(12), 1-20. URL
##   http://www.jstatsoft.org/v21/i12/.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Reshaping Data with the {reshape} Package},
##     author = {Hadley Wickham},
##     journal = {Journal of Statistical Software},
##     year = {2007},
##     volume = {21},
##     number = {12},
##     pages = {1--20},
##     url = {http://www.jstatsoft.org/v21/i12/},
##   }
packageVersion("reshape2")
## [1] '1.4.4'
citation("tidyverse")
## 
## To cite package 'tidyverse' in publications use:
## 
##   Wickham H, Averick M, Bryan J, Chang W, McGowan LD, François R,
##   Grolemund G, Hayes A, Henry L, Hester J, Kuhn M, Pedersen TL, Miller
##   E, Bache SM, Müller K, Ooms J, Robinson D, Seidel DP, Spinu V,
##   Takahashi K, Vaughan D, Wilke C, Woo K, Yutani H (2019). "Welcome to
##   the tidyverse." _Journal of Open Source Software_, *4*(43), 1686.
##   doi:10.21105/joss.01686 <https://doi.org/10.21105/joss.01686>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Welcome to the {tidyverse}},
##     author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani},
##     year = {2019},
##     journal = {Journal of Open Source Software},
##     volume = {4},
##     number = {43},
##     pages = {1686},
##     doi = {10.21105/joss.01686},
##   }
packageVersion("tidyverse")
## [1] '2.0.0'
citation("broom")
## 
## To cite package 'broom' in publications use:
## 
##   Robinson D, Hayes A, Couch S (2023). _broom: Convert Statistical
##   Objects into Tidy Tibbles_. R package version 1.0.5,
##   <https://CRAN.R-project.org/package=broom>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {broom: Convert Statistical Objects into Tidy Tibbles},
##     author = {David Robinson and Alex Hayes and Simon Couch},
##     year = {2023},
##     note = {R package version 1.0.5},
##     url = {https://CRAN.R-project.org/package=broom},
##   }
packageVersion("broom")
## [1] '1.0.5'
citation("aod")
## 
## To cite package 'aod' in publications use:
## 
##   Lesnoff, M., Lancelot, R. (2012). aod: Analysis of Overdispersed
##   Data. R package version 1.3.2, URL
##   http://cran.r-project.org/package=aod
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {aod: Analysis of Overdispersed Data},
##     author = {{Lesnoff} and {M.} and {Lancelot} and {R.}},
##     year = {2012},
##     note = {R package version 1.3.2},
##     url = {https://cran.r-project.org/package=aod},
##   }
packageVersion("aod")
## [1] '1.3.2'
citation("glmtoolbox")
## 
## To cite package 'glmtoolbox' in publications use:
## 
##   Vanegas L, Rondón L, Paula G (2023). _glmtoolbox: Set of Tools to
##   Data Analysis using Generalized Linear Models_. R package version
##   0.1.9, <https://CRAN.R-project.org/package=glmtoolbox>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {glmtoolbox: Set of Tools to Data Analysis using Generalized Linear Models},
##     author = {Luis Hernando Vanegas and Luz Marina Rondón and Gilberto A. Paula},
##     year = {2023},
##     note = {R package version 0.1.9},
##     url = {https://CRAN.R-project.org/package=glmtoolbox},
##   }
packageVersion("glmtoolbox")
## [1] '0.1.9'
citation("fmsb")
## 
## To cite package 'fmsb' in publications use:
## 
##   Nakazawa M (2023). _fmsb: Functions for Medical Statistics Book with
##   some Demographic Data_. R package version 0.7.5,
##   <https://CRAN.R-project.org/package=fmsb>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {fmsb: Functions for Medical Statistics Book with some Demographic Data},
##     author = {Minato Nakazawa},
##     year = {2023},
##     note = {R package version 0.7.5},
##     url = {https://CRAN.R-project.org/package=fmsb},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
packageVersion("fmsb")
## [1] '0.7.5'
citation("uwot")
## 
## To cite package 'uwot' in publications use:
## 
##   Melville J (2023). _uwot: The Uniform Manifold Approximation and
##   Projection (UMAP) Method for Dimensionality Reduction_. R package
##   version 0.1.16, <https://CRAN.R-project.org/package=uwot>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {uwot: The Uniform Manifold Approximation and Projection (UMAP) Method
## for Dimensionality Reduction},
##     author = {James Melville},
##     year = {2023},
##     note = {R package version 0.1.16},
##     url = {https://CRAN.R-project.org/package=uwot},
##   }
packageVersion("uwot")
## [1] '0.1.16'
citation("ggthemes")
## 
## To cite package 'ggthemes' in publications use:
## 
##   Arnold J (2021). _ggthemes: Extra Themes, Scales and Geoms for
##   'ggplot2'_. R package version 4.2.4,
##   <https://CRAN.R-project.org/package=ggthemes>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ggthemes: Extra Themes, Scales and Geoms for 'ggplot2'},
##     author = {Jeffrey B. Arnold},
##     year = {2021},
##     note = {R package version 4.2.4},
##     url = {https://CRAN.R-project.org/package=ggthemes},
##   }
packageVersion("ggthemes")
## [1] '4.2.4'
citation("ggforce")
## 
## To cite package 'ggforce' in publications use:
## 
##   Pedersen T (2022). _ggforce: Accelerating 'ggplot2'_. R package
##   version 0.4.1, <https://CRAN.R-project.org/package=ggforce>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {ggforce: Accelerating 'ggplot2'},
##     author = {Thomas Lin Pedersen},
##     year = {2022},
##     note = {R package version 0.4.1},
##     url = {https://CRAN.R-project.org/package=ggforce},
##   }
packageVersion("ggforce")
## [1] '0.4.1'
citation("readxl")
## 
## To cite package 'readxl' in publications use:
## 
##   Wickham H, Bryan J (2023). _readxl: Read Excel Files_. R package
##   version 1.4.3, <https://CRAN.R-project.org/package=readxl>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {readxl: Read Excel Files},
##     author = {Hadley Wickham and Jennifer Bryan},
##     year = {2023},
##     note = {R package version 1.4.3},
##     url = {https://CRAN.R-project.org/package=readxl},
##   }
packageVersion("readxl")
## [1] '1.4.3'
citation("ROCR")
## 
## To cite ROCR in publications use:
## 
##   Sing T, Sander O, Beerenwinkel N, Lengauer T (2005). "ROCR:
##   visualizing classifier performance in R." _Bioinformatics_, *21*(20),
##   7881. <http://rocr.bioinf.mpi-sb.mpg.de>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     entry = {article},
##     title = {ROCR: visualizing classifier performance in R},
##     author = {T. Sing and O. Sander and N. Beerenwinkel and T. Lengauer},
##     year = {2005},
##     journal = {Bioinformatics},
##     volume = {21},
##     number = {20},
##     pages = {7881},
##     url = {http://rocr.bioinf.mpi-sb.mpg.de},
##   }
## 
## We have invested a lot of time and effort in creating ROCR, please cite
## it when using it for data analysis.
packageVersion("ROCR")
## [1] '1.0.11'
citation("pROC")
## 
## If you use pROC in published research, please cite the following paper:
## 
##   Xavier Robin, Natacha Turck, Alexandre Hainard, Natalia Tiberti,
##   Frédérique Lisacek, Jean-Charles Sanchez and Markus Müller (2011).
##   pROC: an open-source package for R and S+ to analyze and compare ROC
##   curves. BMC Bioinformatics, 12, p. 77.  DOI: 10.1186/1471-2105-12-77
##   <http://www.biomedcentral.com/1471-2105/12/77/>
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {pROC: an open-source package for R and S+ to analyze and compare ROC curves},
##     author = {Xavier Robin and Natacha Turck and Alexandre Hainard and Natalia Tiberti and Frédérique Lisacek and Jean-Charles Sanchez and Markus Müller},
##     year = {2011},
##     journal = {BMC Bioinformatics},
##     volume = {12},
##     pages = {77},
##   }
packageVersion("pROC")
## [1] '1.18.4'
citation("caret")
## 
## To cite caret in publications use:
## 
##   Kuhn, M. (2008). Building Predictive Models in R Using the caret
##   Package. Journal of Statistical Software, 28(5), 1–26.
##   https://doi.org/10.18637/jss.v028.i05
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     title = {Building Predictive Models in R Using the caret Package},
##     volume = {28},
##     url = {https://www.jstatsoft.org/index.php/jss/article/view/v028i05},
##     doi = {10.18637/jss.v028.i05},
##     number = {5},
##     journal = {Journal of Statistical Software},
##     author = {{Kuhn} and {Max}},
##     year = {2008},
##     pages = {1–26},
##   }
packageVersion("caret")
## [1] '6.0.94'
citation("xgboost")
## 
## To cite package 'xgboost' in publications use:
## 
##   Chen T, He T, Benesty M, Khotilovich V, Tang Y, Cho H, Chen K,
##   Mitchell R, Cano I, Zhou T, Li M, Xie J, Lin M, Geng Y, Li Y, Yuan J
##   (2023). _xgboost: Extreme Gradient Boosting_. R package version
##   1.7.5.1, <https://CRAN.R-project.org/package=xgboost>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {xgboost: Extreme Gradient Boosting},
##     author = {Tianqi Chen and Tong He and Michael Benesty and Vadim Khotilovich and Yuan Tang and Hyunsu Cho and Kailong Chen and Rory Mitchell and Ignacio Cano and Tianyi Zhou and Mu Li and Junyuan Xie and Min Lin and Yifeng Geng and Yutian Li and Jiaming Yuan},
##     year = {2023},
##     note = {R package version 1.7.5.1},
##     url = {https://CRAN.R-project.org/package=xgboost},
##   }
packageVersion("xgboost")
## [1] '1.7.5.1'
citation("smotefamily")
## 
## To cite package 'smotefamily' in publications use:
## 
##   Siriseriwan W (2019). _smotefamily: A Collection of Oversampling
##   Techniques for Class Imbalance Problem Based on SMOTE_. R package
##   version 1.3.1, <https://CRAN.R-project.org/package=smotefamily>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {smotefamily: A Collection of Oversampling Techniques for Class Imbalance
## Problem Based on SMOTE},
##     author = {Wacharasak Siriseriwan},
##     year = {2019},
##     note = {R package version 1.3.1},
##     url = {https://CRAN.R-project.org/package=smotefamily},
##   }
packageVersion("smotefamily")
## [1] '1.3.1'
citation("rsample")
## 
## To cite package 'rsample' in publications use:
## 
##   Frick H, Chow F, Kuhn M, Mahoney M, Silge J, Wickham H (2023).
##   _rsample: General Resampling Infrastructure_. R package version
##   1.2.0, <https://CRAN.R-project.org/package=rsample>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {rsample: General Resampling Infrastructure},
##     author = {Hannah Frick and Fanny Chow and Max Kuhn and Michael Mahoney and Julia Silge and Hadley Wickham},
##     year = {2023},
##     note = {R package version 1.2.0},
##     url = {https://CRAN.R-project.org/package=rsample},
##   }
packageVersion("rsample")
## [1] '1.2.0'
citation("mltools")
## 
## To cite package 'mltools' in publications use:
## 
##   Gorman B (2018). _mltools: Machine Learning Tools_. R package version
##   0.3.5, <https://CRAN.R-project.org/package=mltools>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {mltools: Machine Learning Tools},
##     author = {Ben Gorman},
##     year = {2018},
##     note = {R package version 0.3.5},
##     url = {https://CRAN.R-project.org/package=mltools},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
packageVersion("mltools")
## [1] '0.3.5'
citation("data.table")
## 
## To cite package 'data.table' in publications use:
## 
##   Dowle M, Srinivasan A (2023). _data.table: Extension of
##   `data.frame`_. R package version 1.14.8,
##   <https://CRAN.R-project.org/package=data.table>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {data.table: Extension of `data.frame`},
##     author = {Matt Dowle and Arun Srinivasan},
##     year = {2023},
##     note = {R package version 1.14.8},
##     url = {https://CRAN.R-project.org/package=data.table},
##   }
packageVersion("data.table")
## [1] '1.14.8'
citation("heatmaply")
## 
## The methods within the package can be cited as:
## 
##   Galili, Tal, O'Callaghan, Alan, Sidi, Jonathan, Sievert, Carson
##   (2017). "heatmaply: an R package for creating interactive cluster
##   heatmaps for online publishing." _Bioinformatics_.
##   doi:10.1093/bioinformatics/btx657
##   <https://doi.org/10.1093/bioinformatics/btx657>,
##   <https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btx657/21358327/btx657.pdf>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Article{,
##     author = {{Galili} and {Tal} and {O'Callaghan} and {Alan} and {Sidi} and {Jonathan} and {Sievert} and {Carson}},
##     title = {heatmaply: an R package for creating interactive cluster heatmaps for online publishing},
##     journal = {Bioinformatics},
##     year = {2017},
##     doi = {10.1093/bioinformatics/btx657},
##     url = {https://academic.oup.com/bioinformatics/article-pdf/doi/10.1093/bioinformatics/btx657/21358327/btx657.pdf},
##   }
## 
## This free open-source software implements academic research by the
## authors and co-workers. If you use it, please support the project by
## citing the appropriate journal articles.
packageVersion("heatmaply")
## [1] '1.5.0'
citation("rstatix")
## 
## To cite package 'rstatix' in publications use:
## 
##   Kassambara A (2023). _rstatix: Pipe-Friendly Framework for Basic
##   Statistical Tests_. R package version 0.7.2,
##   <https://CRAN.R-project.org/package=rstatix>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {rstatix: Pipe-Friendly Framework for Basic Statistical Tests},
##     author = {Alboukadel Kassambara},
##     year = {2023},
##     note = {R package version 0.7.2},
##     url = {https://CRAN.R-project.org/package=rstatix},
##   }
packageVersion("rstatix")
## [1] '0.7.2'
citation("dbscan")
## 
## Hahsler M, Piekenbrock M (2022). _dbscan: Density-Based Spatial
## Clustering of Applications with Noise (DBSCAN) and Related Algorithms_.
## R package version 1.1-11, <https://CRAN.R-project.org/package=dbscan>.
## 
## To cite dbscan in publications use:
## 
##   Hahsler M, Piekenbrock M, Doran D (2019). "dbscan: Fast Density-Based
##   Clustering with R." _Journal of Statistical Software_, *91*(1), 1-30.
##   doi:10.18637/jss.v091.i01 <https://doi.org/10.18637/jss.v091.i01>.
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
packageVersion("dbscan")
## [1] '1.1.11'
citation("corrplot")
## 
## To cite corrplot in publications use:
## 
##   Taiyun Wei and Viliam Simko (2021). R package 'corrplot':
##   Visualization of a Correlation Matrix (Version 0.92). Available from
##   https://github.com/taiyun/corrplot
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{corrplot2021,
##     title = {R package 'corrplot': Visualization of a Correlation Matrix},
##     author = {Taiyun Wei and Viliam Simko},
##     year = {2021},
##     note = {(Version 0.92)},
##     url = {https://github.com/taiyun/corrplot},
##   }
packageVersion("corrplot")
## [1] '0.92'
citation("showtext")
## 
## To cite package 'showtext' in publications use:
## 
##   Qiu Y, details. aotisSfAf (2023). _showtext: Using Fonts More Easily
##   in R Graphs_. R package version 0.9-6,
##   <https://CRAN.R-project.org/package=showtext>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {showtext: Using Fonts More Easily in R Graphs},
##     author = {Yixuan Qiu and authors/contributors of the included software. See file AUTHORS for details.},
##     year = {2023},
##     note = {R package version 0.9-6},
##     url = {https://CRAN.R-project.org/package=showtext},
##   }
## 
## ATTENTION: This citation information has been auto-generated from the
## package DESCRIPTION file and may need manual editing, see
## 'help("citation")'.
packageVersion("showtext")
## [1] '0.9.6'
citation("cluster")
## 
## To cite the R package 'cluster' in publications use:
## 
##   Maechler, M., Rousseeuw, P., Struyf, A., Hubert, M., Hornik,
##   K.(2022).  cluster: Cluster Analysis Basics and Extensions. R package
##   version 2.1.4.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {cluster: Cluster Analysis Basics and Extensions},
##     author = {Martin Maechler and Peter Rousseeuw and Anja Struyf and Mia Hubert and Kurt Hornik},
##     year = {2022},
##     url = {https://CRAN.R-project.org/package=cluster},
##     note = {R package version 2.1.4 --- For new features, see the 'Changelog' file (in the package source)},
##   }
packageVersion("cluster")
## [1] '2.1.4'
citation("factoextra")
## 
## To cite package 'factoextra' in publications use:
## 
##   Kassambara A, Mundt F (2020). _factoextra: Extract and Visualize the
##   Results of Multivariate Data Analyses_. R package version 1.0.7,
##   <https://CRAN.R-project.org/package=factoextra>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {factoextra: Extract and Visualize the Results of Multivariate Data Analyses},
##     author = {Alboukadel Kassambara and Fabian Mundt},
##     year = {2020},
##     note = {R package version 1.0.7},
##     url = {https://CRAN.R-project.org/package=factoextra},
##   }
packageVersion("factoextra")
## [1] '1.0.7'
citation("clusterSim")
## 
## To cite clusterSim in publications use:
## 
##   Walesiak M, Dudek A (2020). "The Choice of Variable Normalization
##   Method in Cluster Analysis." In Soliman KS (ed.), _Education
##   Excellence and Innovation Management: A 2025 Vision to Sustain
##   Economic Development During Global Challenges_, 325-340. ISBN
##   978-0-9998551-4-1.
## 
## A BibTeX entry for LaTeX users is
## 
##   @InProceedings{,
##     title = {The Choice of Variable Normalization Method in Cluster Analysis},
##     author = {Marek Walesiak and Andrzej Dudek},
##     booktitle = {Education Excellence and Innovation Management: A 2025 Vision to Sustain Economic Development During Global Challenges},
##     booksubtitle = {Proceedings of the 35th International Business Information Management Association Conference (IBIMA), 1-2 April 2020, Seville, Spain},
##     year = {2020},
##     publisher = {International Business Information Management Association (IBIMA)},
##     editor = {Khalid S. Soliman},
##     pages = {325-340},
##     isbn = {978-0-9998551-4-1},
##   }
packageVersion("clusterSim")
## [1] '0.51.3'
citation("RColorBrewer")
## 
## To cite package 'RColorBrewer' in publications use:
## 
##   Neuwirth E (2022). _RColorBrewer: ColorBrewer Palettes_. R package
##   version 1.1-3, <https://CRAN.R-project.org/package=RColorBrewer>.
## 
## A BibTeX entry for LaTeX users is
## 
##   @Manual{,
##     title = {RColorBrewer: ColorBrewer Palettes},
##     author = {Erich Neuwirth},
##     year = {2022},
##     note = {R package version 1.1-3},
##     url = {https://CRAN.R-project.org/package=RColorBrewer},
##   }
packageVersion("RColorBrewer")
## [1] '1.1.3'
citation("IHW")
## 
## To cite package 'IHW' in publications use:
## 
##   Nikolaos Ignatiadis, Bernd Klaus, Judith Zaugg and Wolfgang Huber
##   (2016): Data-driven hypothesis weighting increases detection power in
##   genome-scale multiple testing. Nature Methods 13:577, doi:
##   10.1038/nmeth.3885
## 
##   Nikolaos Ignatiadis and Wolfgang Huber (2016): Covariate-powered
##   weighted multiple testing with false discovery rate control.
##   arXiv:1701.05179
## 
## To see these entries in BibTeX format, use 'print(<citation>,
## bibtex=TRUE)', 'toBibtex(.)', or set
## 'options(citation.bibtex.max=999)'.
packageVersion("IHW")
## [1] '1.26.0'
if (SAVE) {
    save(ml, file = "ml_read_data_101.RData")
}